# Attach packages quietly; simulateGP supplies the GWAS simulation helpers
suppressPackageStartupMessages(suppressWarnings({
  library(simulateGP)
  library(tidyverse)
  library(data.table)
  library(knitr)
}))
# Cache chunks; show code but hide messages/warnings in the rendered report
knitr::opts_chunk$set(cache = TRUE, echo = TRUE, message = FALSE, warning = FALSE)

Simulation function

# Run one simulation scenario.
#
# p: one-row tibble/data.frame with columns nsnp, h2, S, nid, pval.
# Returns p augmented with:
#   nsig              - number of SNPs passing the p-value threshold
#   nsig_weak         - significant SNPs whose true F-statistic < 20
#   nsig_overestimate - significant SNPs whose lower 95% CI bound on |bhat|
#                       still exceeds the true |beta|
#   min_rsq           - smallest variance explained among significant SNPs
#                       (NA when nothing is significant; previously this was
#                       min() of an empty vector, i.e. Inf with a warning)
runsim <- function(p)
{
    # One row per causal SNP with a random allele frequency
    map <- tibble(snp=1:p$nsnp, af=runif(p$nsnp, 0.01, 0.99))
    # True effects under heritability h2 and selection parameter S;
    # rsq is the per-SNP variance explained assuming HWE
    gp <- generate_gwas_params(map, h2=p$h2, S = p$S) %>%
        mutate(
            rsq=2 * beta^2 * af * (1-af)
        )
    # Simulated summary stats; tfval is the "true" F-statistic computed
    # from the true beta (gp$beta), not the estimate -- rows of ss are
    # assumed to align with rows of gp
    ss <- generate_gwas_ss(gp, nid=p$nid) %>%
        mutate(tfval = (gp$beta/se)^2)
    sig <- ss$pval < p$pval
    p$nsig <- sum(sig)
    p$nsig_weak <- sum(sig & ss$tfval < 20)
    # Overestimated: even the lower CI bound of the estimate exceeds truth
    p$nsig_overestimate <- sum(sig & (abs(ss$bhat) - 1.96 * ss$se) > abs(gp$beta))
    p$min_rsq <- if (any(sig)) min(ss$rsq[sig]) else NA_real_
    return(p)
}

Setup parameters

# Full factorial of simulation scenarios: sample size, number of causal
# SNPs, heritability, GWAS significance threshold, selection parameter S,
# and 10 replicates of each combination (25 x 6 x 3 x 1 x 3 x 10 = 13,500
# rows, matching the str() output below)
param <- expand.grid(
    nid=seq(10000, 250000, by=10000),
    nsnp=c(500, 1000, 5000, 10000, 50000, 100000),
    h2=c(0.1, 0.5, 0.9),
    pval=5e-8,
    S=c(-1, 0, 1),
    rep=1:10
) %>% as_tibble()
str(param)
## tibble [13,500 × 6] (S3: tbl_df/tbl/data.frame)
##  $ nid : num [1:13500] 1e+04 2e+04 3e+04 4e+04 5e+04 6e+04 7e+04 8e+04 9e+04 1e+05 ...
##  $ nsnp: num [1:13500] 500 500 500 500 500 500 500 500 500 500 ...
##  $ h2  : num [1:13500] 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
##  $ pval: num [1:13500] 5e-08 5e-08 5e-08 5e-08 5e-08 5e-08 5e-08 5e-08 5e-08 5e-08 ...
##  $ S   : num [1:13500] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ rep : int [1:13500] 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "out.attrs")=List of 2
##   ..$ dim     : Named int [1:6] 25 6 3 1 3 10
##   .. ..- attr(*, "names")= chr [1:6] "nid" "nsnp" "h2" "pval" ...
##   ..$ dimnames:List of 6
##   .. ..$ nid : chr [1:25] "nid= 10000" "nid= 20000" "nid= 30000" "nid= 40000" ...
##   .. ..$ nsnp: chr [1:6] "nsnp=5e+02" "nsnp=1e+03" "nsnp=5e+03" "nsnp=1e+04" ...
##   .. ..$ h2  : chr [1:3] "h2=0.1" "h2=0.5" "h2=0.9"
##   .. ..$ pval: chr "pval=5e-08"
##   .. ..$ S   : chr [1:3] "S=-1" "S= 0" "S= 1"
##   .. ..$ rep : chr [1:10] "rep= 1" "rep= 2" "rep= 3" "rep= 4" ...

Run simulations

# Run every scenario and stack the one-row results into a single tibble
l <- lapply(seq_len(nrow(param)), function(idx) runsim(param[idx, ]))
l <- bind_rows(l)

What fraction of discovered SNPs will be significantly overestimated for a given minimum rsq value?

# Proportion of discoveries that are overestimated, by sample size;
# colour mapping shared by both layers so it lives in the base aes()
ggplot(l, aes(x = nid, y = nsig_overestimate / nsig, colour = as.factor(nsnp))) +
  geom_point() +
  geom_smooth() +
  scale_x_log10() +
  ylim(c(0, 1)) +
  scale_colour_brewer(type = "qual") +
  facet_grid(h2 ~ .)

And what fraction will be weak?

# Proportion of discoveries that are weak (true F < 20), by sample size
ggplot(l, aes(x = nid, y = nsig_weak / nsig, colour = as.factor(nsnp))) +
  geom_point() +
  geom_smooth() +
  scale_x_log10() +
  scale_colour_brewer(type = "qual") +
  facet_grid(h2 ~ S)

What is the relationship between overestimated and weak instruments? As power improves, more of the overestimated instruments remain strong.

# Overestimated vs weak counts: with better power, overestimates need not
# be weak
ggplot(l, aes(x = nsig_overestimate, y = nsig_weak, colour = as.factor(nsnp))) +
  geom_point() +
  geom_smooth() +
  scale_x_log10() +
  scale_colour_brewer(type = "qual") +
  facet_grid(h2 ~ S)

How many instruments are simply overestimated?

# Absolute number of overestimated discoveries by sample size
ggplot(l, aes(x = nid, y = nsig_overestimate, colour = as.factor(nsnp))) +
  geom_point() +
  geom_smooth() +
  scale_x_log10() +
  scale_colour_brewer(type = "qual") +
  facet_grid(h2 ~ S)

# Same proportion as above, but faceted by both h2 and S
ggplot(l, aes(x = nid, y = nsig_overestimate / nsig, colour = as.factor(nsnp))) +
  geom_point() +
  geom_smooth() +
  scale_x_log10() +
  scale_colour_brewer(type = "qual") +
  facet_grid(h2 ~ S)

# Smallest variance explained among significant SNPs, by sample size
ggplot(l, aes(x = nid, y = min_rsq, colour = as.factor(nsnp))) +
  geom_point() +
  geom_smooth() +
  scale_colour_brewer(type = "qual") +
  facet_grid(h2 ~ S)

# Number of significant discoveries by sample size
ggplot(l, aes(x = nid, y = nsig, colour = as.factor(nsnp))) +
  geom_point() +
  geom_smooth() +
  scale_colour_brewer(type = "qual") +
  facet_grid(h2 ~ S)

# Proportion overestimated as a function of the number of discoveries
ggplot(l, aes(x = nsig, y = nsig_overestimate / nsig, colour = as.factor(nsnp))) +
  geom_point() +
  geom_smooth() +
  scale_colour_brewer(type = "qual") +
  facet_grid(h2 ~ S)


Sample split GWAS

# Load per-trait instrument tables from the sample-split UK Biobank GWAS
load("../results/instrument_list.rdata")
counts <- fread("../results/count_tophits.txt")
# Keep only SNPs with a discovery-half estimate
instrument_counts <- subset(instrument_counts, !is.na(BETA.disc))
# P-value corresponding to F = 10 (conventional weak-instrument threshold)
# with 1 and 100000 df. Spell out lower.tail instead of relying on partial
# argument matching ("low"), which lintr flags and which breaks if pf()
# ever gains another argument starting with "low".
weak_p <- pf(10, 1, 100000, lower.tail=FALSE)

Quick summary

# Per-trait instrument counts for the sample-split GWAS.
# BUG FIX: the original used mean(ndisc, nrepl), which passes nrepl as
# mean()'s second argument `trim` and therefore just returns ndisc (the
# recorded outputs below show sum(nhalf) == sum(ndisc) == 13673 as a
# symptom). The two counts must be wrapped in c() to be averaged; the same
# applies to nhalf_overestimated and nhalf_weak. Recorded outputs further
# down were generated with the buggy version and will change on re-run.
s1 <- instrument_counts %>%
    group_by(id) %>%
    summarise(
        # Genome-wide significant hits in each half
        ndisc=sum(P_BOLT_LMM_INF.disc < 5e-8),
        nrepl=sum(P_BOLT_LMM_INF.repl < 5e-8),
        # Average of the two halves
        nhalf=mean(c(ndisc, nrepl)),
        # Significant in either half
        ntot=sum(P_BOLT_LMM_INF.disc < 5e-8 | P_BOLT_LMM_INF.repl < 5e-8),
        # Significant in one half but weak (p above the F=10 threshold) in
        # the other
        ndisc_weak=sum(P_BOLT_LMM_INF.disc < 5e-8 & P_BOLT_LMM_INF.repl > weak_p),
        nrepl_weak=sum(P_BOLT_LMM_INF.repl < 5e-8 & P_BOLT_LMM_INF.disc > weak_p),
        # Significant and the lower 95% CI bound of |beta| in that half
        # still exceeds the other half's |beta|
        ndisc_overestimated=sum(P_BOLT_LMM_INF.disc < 5e-8 & (abs(BETA.disc) - 1.96 * SE.disc) > abs(BETA.repl)),
        nrepl_overestimated=sum(P_BOLT_LMM_INF.repl < 5e-8 & (abs(BETA.repl) - 1.96 * SE.repl) > abs(BETA.disc)),
        nhalf_overestimated=mean(c(ndisc_overestimated, nrepl_overestimated)),
        nhalf_weak=mean(c(ndisc_weak, nrepl_weak))
    )

Number of traits with an instrument

# Traits with at least one instrument (by the half-averaged count)
sum(s1$nhalf > 0)
## [1] 590

Total number of instruments

# Instrument totals across traits. Note sum(nhalf) equals sum(ndisc)
# exactly -- a symptom of mean(ndisc, nrepl) in the s1 summarise treating
# nrepl as mean()'s trim argument rather than averaging the two counts.
sum(s1$nhalf)
## [1] 13673
sum(s1$ndisc)
## [1] 13673
sum(s1$nrepl)
## [1] 13723
# Significant in either half
sum(s1$ntot)
## [1] 18482

How many instruments per trait?

# Distribution of instruments per trait, for all traits and for traits
# with at least one instrument in the respective half
summary(s1$ndisc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    1.00   12.13    4.00  477.00
summary(s1$nrepl)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    1.00   12.18    3.00  460.00
summary(s1$ndisc[s1$ndisc != 0])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.00    3.00   23.17   11.00  477.00
summary(s1$nrepl[s1$nrepl != 0])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.00    3.00   23.54   11.00  460.00

Number of weak instruments

# Total weak instruments (significant in one half, F-equivalent p-value
# above the weak threshold in the other). nhalf_weak equals ndisc_weak
# here for the same mean(trim) reason noted above.
sum(s1$nhalf_weak)
## [1] 734
sum(s1$ndisc_weak)
## [1] 734
sum(s1$nrepl_weak)
## [1] 628

Number of overestimated instruments

# Total overestimated instruments (lower CI bound in the discovery half
# exceeds the other half's estimate)
sum(s1$nhalf_overestimated)
## [1] 2623
sum(s1$ndisc_overestimated)
## [1] 2623
sum(s1$nrepl_overestimated)
## [1] 2555

Overestimated instruments

# Per-trait distributions of overestimated instruments, restricted to
# traits with at least one instrument
summary(s1$nhalf_overestimated[s1$nhalf != 0])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   4.446   3.000  56.000
summary(s1$ndisc_overestimated[s1$ndisc != 0])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   4.446   3.000  56.000
summary(s1$nrepl_overestimated[s1$nrepl != 0])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   4.383   4.000  64.000
# Proportion overestimated and proportion weak per trait
hist(s1$nhalf_overestimated / s1$nhalf, breaks=100)

hist(s1$nhalf_weak / s1$nhalf, breaks=100)

# Same proportions against the number of discoveries per trait
ggplot(s1, aes(x=ndisc, y=ndisc_overestimated / ndisc)) +
  geom_point()

ggplot(s1, aes(x=ndisc, y=ndisc_weak / ndisc)) +
  geom_point()

# Overlay the UK Biobank per-trait proportions on the simulation results.
# Build the combined data frame first, then plot, rather than piping into
# a braced ggplot expression.
plotdat <- bind_rows(
  l %>% dplyr::select(nsig=nsig, nsnp=nsnp, nsig_overestimate) %>% mutate(what="Simulations"),
  s1 %>% dplyr::select(nsig=ndisc, nsig_overestimate=ndisc_overestimated) %>% mutate(what="UK Biobank")
)
p1 <- ggplot(plotdat, aes(x=nsig, y=nsig_overestimate/nsig)) +
  geom_point(data=subset(plotdat, what !="UK Biobank"), aes(colour=as.factor(nsnp)), size=0.1, alpha=0.5) +
  geom_smooth(data=subset(plotdat, what !="UK Biobank"), aes(colour=as.factor(nsnp)), se=FALSE) +
  geom_point(data=subset(plotdat, what =="UK Biobank")) +
  scale_colour_brewer(type="seq", palette=2) +
  # scale_x_log10() +
  xlim(c(0,500)) +
  labs(x="Discovered associations", y="Proportion overestimated", colour="Number of causal variants", shape="")
p1

# Long-format per-trait proportions (overestimated, weak) as dodged
# histograms; data prepared in a named intermediate instead of piping
# straight into ggplot
prop_long <- s1 %>%
  mutate(propover=ndisc_overestimated/ndisc, propweak=ndisc_weak/ndisc) %>%
  dplyr::select(id, propover, propweak) %>%
  pivot_longer(cols=c(propover, propweak))
ggplot(prop_long, aes(x=value)) +
  geom_histogram(aes(fill=name), position="dodge", bins=100)

# geom_density(aes(fill=name), alpha=0.2)
ggplot(s1, aes(x=ndisc_overestimated/ndisc)) +
  geom_histogram(bins=100)